{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 利用聚类方法，根据企业的文本描述对企业进行分类\n",
    "\n",
    "文件说明:\n",
    "    每个训练样本包含两个字段，分别为企业类别标签（1～10）和企业的文字描述（文本）。\n",
    "    \n",
    "训练数据示例：\n",
    "    2\t公司的主营业务为向中小微企业、个体工商户、农户等客户提供贷款服务，自设立以来主营业务未发生过变化。\n",
    "    1\t公司立足于商业地产服务，致力于为商业地产开发、销售、运营全产业链提供一整套增值服务，业务覆盖商业定位及策划、商业设计、销售代理、招商代理电子商务、以及商业地产运管服务；同时开展应用互联网电商模式，采取O2O线上导流线下服务方式进行住宅类业务的创新营销服务。公司的业务板块包括商业地产策划顾问、专业招商及运营管理、代理销售、麦吉铺O2O电子商务。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {},
   "outputs": [],
   "source": [
    "#导入必要的工具包\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "from matplotlib import pyplot\n",
    "import seaborn as sns\n",
    "%matplotlib inline\n",
    "\n",
    "from sklearn.preprocessing import normalize\n",
    "from sklearn.cluster import MiniBatchKMeans\n",
    "\n",
    "from sklearn import metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {},
   "outputs": [],
   "source": [
    "#读入数据\n",
    "data = pd.read_csv('training.csv')\n",
    "\n",
    "#将数据分成两个属性\n",
    "data.columns = ['type', 'text']\n",
    "\n",
    "data['type'] = data['type'].map(lambda s: int(s) - 1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 特征探索"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 4773 entries, 0 to 4772\n",
      "Data columns (total 2 columns):\n",
      "type    4773 non-null int64\n",
      "text    4773 non-null object\n",
      "dtypes: int64(1), object(1)\n",
      "memory usage: 74.7+ KB\n"
     ]
    }
   ],
   "source": [
    "data.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>type</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>4773.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>4.070186</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>2.286824</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>2.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>3.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>5.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>10.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              type\n",
       "count  4773.000000\n",
       "mean      4.070186\n",
       "std       2.286824\n",
       "min       0.000000\n",
       "25%       2.000000\n",
       "50%       3.000000\n",
       "75%       5.000000\n",
       "max      10.000000"
      ]
     },
     "execution_count": 91,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Text(0,0.5,'Number of type')"
      ]
     },
     "execution_count": 92,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAY4AAAEKCAYAAAAFJbKyAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAF4tJREFUeJzt3X20XXV95/H3xwAKiIaHKwsTarBmrIytwmQYLC60UhXUEkVpoT5ExMaO6KB2tQbHkbZOV3WqYqmWygg1KkIxoEktPkQUXW0XDwEfCAYlokIEyVUQqI4i+p0/9r7mGG5uzs6955x7ue/XWnedvX/7d873d1bgfu5++u1UFZIk9eshox6AJGluMTgkSZ0YHJKkTgwOSVInBockqRODQ5LUicEhSerE4JAkdWJwSJI62W3UAxiEAw44oJYsWTLqYUjSnHLttdd+v6rGdtbvQRkcS5YsYcOGDaMehiTNKUm+008/D1VJkjoxOCRJnRgckqRODA5JUicGhySpE4NDktSJwSFJ6sTgkCR1YnBIkjp5UN45Pt994vzjBl7jea/45KTtZ33k2QOt+/o//PRAP1/SzrnHIUnqxOCQJHVicEiSOjE4JEmdGBySpE4MDklSJwaHJKmTgQVHkvOTbE2ysaftb5LcmOSrST6WZGHPtjOSbE7y9STP7mk/tm3bnGTVoMYrSerPIPc4PgAcu13beuCJVfVbwDeAMwCSHAqcBPzn9j1/n2RBkgXAe4HjgEOBk9u+kqQRGVhwVNUXgTu3a/tMVd3frl4JLG6XlwMXVdVPq+pbwGbgiPZnc1XdXFX3ARe1fSVJIzLKcxyvACbmrVgE3NqzbUvbtqP2B0iyMsmGJBvGx8cHMFxJEowoOJL8T+B+4IKJpkm61RTtD2ysOreqllXVsrGxsZkZqCTpAYY+yWGSFcDzgGOqaiIEtgAH93RbDNzWLu+oXZI0AkPd40hyLPBG4Piq+nHPpnXASUkemuQQYClwNXANsDTJIUn2oDmBvm6YY5Yk/aqB7XEkuRB4OnBAki3AmTRXUT0UWJ8E4Mqq+uOquiHJxcDXaA5hnVZVP28/5zXAp4EFwPlVdcOgxixJ2rmBBUdVnTxJ83lT9P8r4K8mab8MuGwGhyZJmgbvHJckdWJwSJI6MTgkSZ0YHJKkTgwOSVInBockqRODQ5LUicEhSerE4JAkdWJwSJI6MTgkSZ0YHJKkTgwOSVInBockqRODQ5LUicEhSerE4JAkdWJwSJI6MTgkSZ0YHJKkTgwOSVInBockqRODQ5LUycCCI8n5SbYm2djTtl+S9Ulual/3bduT5Owkm5N8NcnhPe9Z0fa/KcmKQY1XktSfQe5xfAA4dru2VcDlVbUUuLxdBzgOWNr+rATOgSZogDOB/wYcAZw5ETaSpNEYWHBU1ReBO7drXg6sbpdXA8/vaf9gNa4EFiY5CHg2sL6q7qyqu4D1PDCMJElDNOxzHAdW1e0A7euj2vZFwK09/ba0bTtqlySNyGw5OZ5J2mqK9gd+QLIyyYYkG8bHx2d0cJKkbYYdHHe0h6BoX7e27VuAg3v6LQZum6L9Aarq3KpaVlXLxsbGZnzgkqTGsINjHTBxZdQKYG1P+8vaq6uOBO5uD2V9GnhWkn3bk+LPatskSSOy26A+OMmFwNOBA5Jsobk66m3AxUlOBW4BTmy7XwY8B9gM/Bg4BaCq7kzyVuCatt9fVtX2J9wlSUM0sOCoqpN3sOmYSfoWcNoOPud84PwZHJokaRpmy8lxSdIcYXBIkjoxOCRJnRgckqRODA5JUicGhySpE4NDktSJwSFJ6sTgkCR1YnBIkjoxOCRJnfQVHEmemuSUdnksySGDHZYkabbaaXAkORN4I3BG27Q78OFBDkqSNHv1s8fxAuB44EcAVXUbsM8gByVJmr36CY772mnPCyDJ3oMdkiRpNusnOC5O8j5gYZI/Aj4L/N/BDkuSNFvt9EFOVfWOJM8E7gH+E/CWqlo/8JFJkmalfp8AeD2wJ83hqusHNxxJ0mzXz1VVrwSuBk4AXgRcmeQVgx6YJGl26meP40+Bw6rqBwBJ9gf+HZ8DLknzUj8nx7cA9/as3wvcOpjhSJJmu372OL4LXJVkLc05juXA1UneAFBV7xrg+CRJs0w/wfHN9mfC2vbVmwAlaR7qJzguqaqNAx+JJGlO6Occxz8kuTrJq5MsnImiSV6f5IYkG5NcmORhSQ5JclWSm5L8U5I92r4Pbdc3t9uXzMQYJEm7ZqfBUVVPBV4CHAxsSPKRJM/a1YJJFgH/A1hWVU8EFgAnAW8HzqqqpcBdwKntW04F7qqqxwFntf0kSSPS1w2AVfWNJG8GNgBnA4clCfCmqrp0F+vumeRnwF7A7cAzgD9st68G/hw4h+Zk/J+37WuA9yRJO3+WNHLPveR9A/38f3nhqwb6+VJX/dwA+FtJzgI20fxy/72qekK7fFbXglX1XeAdwC00gXE3cC3ww6q6v+22BVjULi+ivfy33X43sP8k41yZZEOSDePj412HJUnqUz/nON4DXAc8qapOq6rr4JfTq7+5a8Ek+9LsRRwCPBrYGzhukq4TexSZYtu2hqpzq2pZVS0bGxvrOixJUp/6CY5Lq+pDVfX/JhqSnA5QVR/ahZq/C3yrqsar6mfApcBv08y+O3HobDFwW7u8heb8Cu32RwJ37kJdSdIM6Cc4XjZJ28unUfMW4Mgke7XnSY4BvgZ8nmYuLIAVbLtfZF27Trv9c57fkKTR2eHJ8SQn05ysPiTJup5N+wA/2NWCVXVVkjU0h7/uB74EnAv8C3BRkv/dtp3XvuU84ENJNtPsaZy0q7UlSdM31VVV/05z8voA4J097fcCX51O0ao6Ezhzu+abgSMm6fsT4MTp1JMkzZwdBkdVfQf4DvCU4Q1HkjTb9XOOQ5KkXzI4JEmd7DA4klzevjrFhyTpl6Y6OX5QkqcBxye5iO1uxJu4EVCSNL9MFRxvAVbR3Iy3/cOaimbKEUnSPDPVVVVrgDVJ/ldVvXWIY5IkzWI7nR23qt6a5Hjg6Lbpiqr6xGCHJUmarfqZHfevgdNppgX5GnB62yZJmof6eR7Hc4EnV9UvAJKsppkS5IxBDkySNDv1ex9H7yNjHzmIgUiS5oZ+9jj+GvhSks/TXJJ7NO5tSNK81c/J8QuTXAH8V5rgeGNVfW/QA5MkzU79PnP8dprnYkiS5jnnqpIkdWJwSJI6mTI4kjwkycZhDUaSNPtNGRztvRtfSfJrQxqPJGmW6+fk+EHADUmuBn400VhVxw9sVJKkWauf4PiLgY9CkjRn9HMfxxeSPAZYWlWfTbIXsGDwQ5MkzUb9THL4R8Aa4H1t0yLg44MclCRp9urnctzTgKOAewCq6ibgUYMclCRp9uonOH5aVfdNrCTZjeYJgLssycIka5LcmGRTkqck2S/J+iQ3ta/7tn2T5Owkm5N8Ncnh06ktSZqefoLjC0neBOyZ5JnAR4F/nmbdvwU+VVW/ATwJ2ETzmNrLq2opcHm7DnAcsLT9WQmcM83akqRp6Cc4VgHjwPXAq4DLgDfvasEkj6CZYfc8gKq6r6p+CCwHVrfdVgPPb5eXAx+sxpXAwiQH7Wp9SdL09HNV1S/ahzddRXOI6utVNZ1DVY+lCaJ/TPIk4FqaJwwe2E6mSFXdnmTiPMoi4Nae929p226fxhgkSbuon6uqngt8EzgbeA+wOclx06i5G3A4cE5VHUZzU+GqKfpnkrYHBFeSlUk2JNkwPj4+jeFJkqbSz6GqdwK/U1VPr6qnAb8DnDWNmluALVV1Vbu+hiZI7pg4BNW+bu3pf3DP+xcDt23/oVV1blUtq6plY2Nj0xieJGkq/QTH1qra3LN+M9t+qXfWPgTq1iSPb5uOAb5G87yPFW3bCmBtu7wOeFl7ddWRwN0Th7QkScO3w3McSU5oF29IchlwMc0hohOBa6ZZ97XABUn2oAmiU2hC7OIkpwK3tHWgORn/HGAz8OO2ryRpRKY6Of57Pct3AE9rl8eBfadTtKq+DCybZNMxk/QtmpsQJUmzwA6Do6r8y16S9AA7vRw3ySE0h5aW9PZ3WnVJmp/6mVb94zQ36/0z8IvBDkeSNNv1Exw/qaqzBz4SSdKc0E9w/G2SM4HPAD+daKyq6wY2KknSrNVPcPwm8FLgGWw7VFXtuiRpnuknOF4APLZ3anVJ0vzVz53jXwEWDnogkqS5oZ89jgOBG5Ncw6+e4/ByXEmah/oJjjMHPgpJ0pzRz/M4vjCMgUiS5oZ+7hy/l23Pv9gD2B34UVU9YpADkyTNTv3scezTu57k+cARAxuRJGlW6+eqql9RVR/Hezgkad7q51DVCT2rD6GZDn06zxyXJM1h/VxV1ftcjvuBbwPLBzIaSdKs1885Dp/LIUn6pakeHfuWKd5XVfXWAYxHkjTLTbXH8aNJ2vYGTgX2BwwOSZqHpnp07DsnlpPsA5wOnAJcBLxzR++TJD24TXmOI8l+wBuAFwOrgcOr6q5hDEySNDtNdY7jb4ATgHOB36yq/xjaqCRJs9ZUNwD+CfBo4M3AbUnuaX/uTXLPcIYnSZptpjrH0fmucknSg9/IwiHJgiRfSvKJdv2QJFcluSnJPyXZo21/aLu+ud2+ZFRjliSNMDhortLa1LP+duCsqloK3EVz2S/t611V9TjgrLafJGlERhIcSRYDzwXe366HZuLENW2X1cDz2+Xl7Trt9mPa/pKkERjVHse7gT8DftGu7w/8sKrub9e3AIva5UXArQDt9rvb/pKkERh6cCR5HrC1qq7tbZ6ka/WxrfdzVybZkGTD+Pj4DIxUkjSZUexxHAUcn+TbNHehP4NmD2RhkomrvBYDt7XLW4CDAdrtjwTu3P5Dq+rcqlpWVcvGxsYG+w0kaR4benBU1RlVtbiqlgAnAZ+rqhcDnwde1HZbAaxtl9e167TbP1dVPg9EkkZkNt2r8UbgDUk205zDOK9tPw/Yv21/A7BqROOTJNHfg5wGpqquAK5ol29mkmeZV9VPgBOHOjBJ0g7Npj0OSdIcYHBIkjoxOCRJnRgckqRODA5JUicGhySpE4NDktSJwSFJ6sTgkCR1YnBIkjoZ6ZQjknbd8WvW7rzTNKx70fKBfr7mLvc4JEmdGBySpE4MDklSJwaHJKkTT45L0k7c+Pd3DLzGb7z6wIHXmCnucUiSOjE4JEmdGBySpE4MDklSJwaHJKkTg0OS1InBIUnqxPs49KBw3No/HniNTy7/h4HXkOaCoe9xJDk4yeeTbEpyQ5LT2/b9kqxPclP7um/bniRnJ9mc5KtJDh/2mCVJ24ziUNX9wJ9U1ROAI4HTkhwKrAIur6qlwOXtOsBxwNL2ZyVwzvCHLEmaMPRDVVV1O3B7u3xvkk3AImA58PS222rgCuCNbfsHq6qAK5MsTHJQ+zmSRuDESzYO9PM/+sInDvTzNT0jPTmeZAlwGHAVcOBEGLSvj2q7LQJu7XnblrZt+89amWRDkg3j4+ODHLYkzWsjC44kDwcuAV5XVfdM1XWStnpAQ9W5VbWsqpaNjY3N1DAlSdsZSXAk2Z0mNC6oqkvb5juSHNRuPwjY2rZvAQ7uefti4LZhjVWS9KtGcVVVgPOATVX1rp5N64AV7fIKYG1P+8vaq6uOBO72/IYkjc4o7uM4CngpcH2SL7dtbwLeBlyc5FTgFuDEdttlwHOAzcCPgVOGO1xJUq9RXFX1r0x+3gLgmEn6F3DaQAclSeqbd45LmjMuvuT7A/3833/hAQP9/AcL56qSJHVicEiSOjE4JEmdGBySpE4MDklSJwaHJKkTg0OS1InBIUnqxOCQJHVicEiSOnHKkQH65t8tH+jn//pr1+68k6Q57Y53Xz3Qzz/wdUd0fo97HJKkTgwOSVInBockqRODQ5LUicEhSerE4JAkdfKgvxx3/JwPD7zG2H9/ycBrSNJs4R6HJKkTg0OS1InBIUnqxOCQJHUyZ4IjybFJvp5kc5JVox6PJM1XcyI4kiwA3gscBxwKnJzk0NGOSpLmpzkRHMARwOaqurmq7gMuAgY79awkaVJzJTgWAbf2rG9p2yRJQ5aqGvUYdirJicCzq+qV7fpLgSOq6rU9fVYCK9vVxwNfn0bJA4DvT+P9c63uKGvPt7qjrO13nh+1p1P3MVU1trNOc+XO8S3AwT3ri4HbejtU1bnAuTNRLMmGqlo2E581F+qOsvZ8qzvK2n7n+VF7GHXnyqGqa4ClSQ5JsgdwErBuxGOSpHlpTuxxVNX9SV4DfBpYAJxfVTeMeFiSNC/NieAAqKrLgMuGVG5GDnnNobqjrD3f6o6ytt95ftQeeN05cXJckjR7zJVzHJKkWcLg6DGqaU2SnJ9ka5KNw6rZ1j04yeeTbEpyQ5LTh1j7YUmuTvKVtvZfDKt2W39Bki8l+cQQa347yfVJvpxkw7DqtrUXJlmT5Mb23/spQ6j5+Pa7Tvzck+R1g67bU//17X9bG5NcmORhQ6p7elvzhkF/38l+dyTZL8n6JDe1r/vOeOGq8qc5XLcA+CbwWGAP4CvAoUOqfTRwOLBxyN/5IODwdnkf4BtD/M4BHt4u7w5cBRw5xO/+BuAjwCeGWPPbwAHD/Dfuqb0aeGW7vAewcMj1FwDfo7lPYBj1FgHfAvZs1y8GXj6Euk8ENgJ70ZxD/iywdID1HvC7A/g/wKp2eRXw9pmu6x7HNiOb1qSqvgjcOYxa29W9vaqua5fvBTYxpDvyq/Ef7eru7c9QTrglWQw8F3j/MOqNWpJH0PyCOQ+gqu6rqh8OeRjHAN+squ8MseZuwJ5JdqP5RX7bTvrPhCcAV1bVj6vqfuALwAsGVWwHvzuW0/yhQPv6/Jmua3BsM6+nNUmyBDiM5i//YdVckOTLwFZgfVUNq/a7gT8DfjGkehMK+EySa9uZDoblscA48I/t4bn3J9l7iPWhuffqwmEVq6rvAu8AbgFuB+6uqs8MofRG4Ogk+yfZC3gOv3rz8jAcWFW3Q/PHIfComS5gcGyTSdrmxSVnSR4OXAK8rqruGVbdqvp5VT2ZZiaAI5I8cdA1kzwP2FpV1w661iSOqqrDaWZ5Pi3J0UOquxvN4Yxzquow4Ec0hzCGor1p93jgo0OsuS/NX96HAI8G9k7ykkHXrapNwNuB9cCnaA553z/ousNmcGyz02lNHoyS7E4TGhdU1aWjGEN72OQK4NghlDsKOD7Jt2kORz4jyYeHUJequq193Qp8jObw6DBsAbb07NGtoQmSYTkOuK6q7hhizd8FvlVV41X1M+BS4LeHUbiqzquqw6vqaJrDSDcNo26PO5IcBNC+bp3pAgbHNvNuWpMkoTnuvamq3jXk2mNJFrbLe9L8j37joOtW1RlVtbiqltD8G3+uqgb+l2iSvZPsM7EMPIvmsMbAVdX3gFuTPL5tOgb42jBqt05miIepWrcARybZq/3v/Biac3gDl+RR7euvAScw/O++DljRLq8A1s50gTlz5/ig1QinNUlyIfB04IAkW4Azq+q8IZQ+CngpcH17rgHgTdXcpT9oBwGr24d0PQS4uKqGdmnsCBwIfKz5HcZuwEeq6lNDrP9a4IL2j6KbgVOGUbQ9zv9M4FXDqDehqq5Ksga4juZQ0ZcY3p3clyTZH/gZcFpV3TWoQpP97gDeBlyc5FSaAD1xxuu2l2xJktQXD1VJkjoxOCRJnRgckqRODA5JUicGhySpE4NDmgHt7LOvHvU4pGEwOKSZsRAwODQvGBzSzHgb8Ovtcyc+muSXMysnuSDJ8UlenmRtkk+1z305s6fPS9rnk3w5yfvaGyOlWcngkGbGKpppw58MvIf2zuwkj6SZI2nibvwjgBcDTwZOTLIsyROAP6CZBPHJwM/bPtKs5JQj0gyrqi8keW87Z9EJwCXtlDbQTB//A4AklwJPpZkS478A17R99mQAE9NJM8XgkAbjQzR7DScBr+hp336On6KZ0n91VZ0xpLFJ0+KhKmlm3Evz+N0JHwBeB7DdZJnPbJ8JvSfNk9n+DbgceFHPrKr7JXnMUEYt7QL3OKQZUFU/SPJvSTYCn6yqP02yCfj4dl3/lWZv5HE0M+RuAEjyZpqnAz6EdlZVYJiPWZX65uy40gC004lfDxxeVXe3bS8HllXVa0Y5Nmm6PFQlzbAkEw+l+ruJ0JAeTNzjkCR14h6HJKkTg0OS1InBIUnqxOCQJHVicEiSOjE4JEmd/H8R2pVVMwb06wAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0xe7293c8>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "sns.countplot(data['type'])\n",
    "pyplot.xlabel('type')\n",
    "pyplot.ylabel('Number of type')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "从图中我们可以看出，类别的分类很不均匀，主要集中在2、3、5类"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 对数据做jieba分词"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {},
   "outputs": [],
   "source": [
    "import jieba\n",
    "\n",
    "#停用词词表\n",
    "stopwords = open('stopwords.txt')\n",
    "\n",
    "# 把文本分词并去除停用词，返回数组\n",
    "def jiebaclearText(words):\n",
    "    seg_list = jieba.cut(words, cut_all = False)\n",
    "    mywordlist = []\n",
    "    for str1 in seg_list:\n",
    "        if str1 not in stopWords:\n",
    "            mywordlist.append(str1)\n",
    "    return ' '.join(mywordlist) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {},
   "outputs": [],
   "source": [
    "data['text'] = data['text'].apply(lambda x: jiebaclearText(x))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>type</th>\n",
       "      <th>text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>公司 主营业务 向 中小 微 企业 个体 工商户 农户 等 客户 提供 贷款 服务 自 设立...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>公司 立足于 商业地产 服务 致力于 商业地产 开发 销售 运营 全 产业链 提供 一整套 ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>公司 经 工商管理 部门 核准 经营范围 “ 投资 咨询 经济 信息 咨询 企业 管理 咨询...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>该 公司 主营业务 在 中国 境内 ( 港 澳 台 除外 ) 开展 保险代理 销售 依托 于...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>公司 主营业务 地铁 商业 物业 租赁 与 运营 管理 服务 公司 以 整体 租赁 方式 取...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   type                                               text\n",
       "0     1  公司 主营业务 向 中小 微 企业 个体 工商户 农户 等 客户 提供 贷款 服务 自 设立...\n",
       "1     0  公司 立足于 商业地产 服务 致力于 商业地产 开发 销售 运营 全 产业链 提供 一整套 ...\n",
       "2     1  公司 经 工商管理 部门 核准 经营范围 “ 投资 咨询 经济 信息 咨询 企业 管理 咨询...\n",
       "3     1  该 公司 主营业务 在 中国 境内 ( 港 澳 台 除外 ) 开展 保险代理 销售 依托 于...\n",
       "4     0  公司 主营业务 地铁 商业 物业 租赁 与 运营 管理 服务 公司 以 整体 租赁 方式 取..."
      ]
     },
     "execution_count": 95,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 采用TFIDF进行特征描述"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer\n",
    "cv = TfidfVectorizer()\n",
    "cv_fit=cv.fit_transform(data['text'])\n",
    "term2id_dict = cv.vocabulary_\n",
    "x=cv_fit.toarray()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(4773, 30732)"
      ]
     },
     "execution_count": 97,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(4773,)"
      ]
     },
     "execution_count": 98,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y = data['text']\n",
    "y.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# KMeans聚类"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 一个参数点（聚类数据为K）的模型\n",
    "def K_cluster_analysis(K, X):\n",
    "    print(\"K-means begin with clusters: {}\".format(K));\n",
    "    \n",
    "    #K-means,在训练集上训练\n",
    "    mb_kmeans = MiniBatchKMeans(n_clusters = K)\n",
    "    y_pred = mb_kmeans.fit_predict(X)\n",
    "    \n",
    "    # K值的评估标准\n",
    "    #本案例中训练数据有标签，可采用有参考模型的评价指标\n",
    "    #v_score = metrics.v_measure_score(y_val, y_val_pred)\n",
    "    \n",
    "    #亦可采用无参考默的评价指标：轮廓系数Silhouette Coefficient和Calinski-Harabasz Index\n",
    "    #这两个分数值越大则聚类效果越好\n",
    "    CH_score = metrics.calinski_harabaz_score(X, y_pred)\n",
    "    \n",
    "    #轮廓系数Silhouette Coefficient在大样本时计算太慢\n",
    "    #si_score = metrics.silhouette_score(X, y_pred)\n",
    "    \n",
    "    print(\"CH_score: {}\".format(CH_score))\n",
    "    #print(\"si_score: {}\".format(si_score))\n",
    "    \n",
    "    return CH_score#,si_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "K-means begin with clusters: 5\n",
      "CH_score: 26.26424713926408\n",
      "K-means begin with clusters: 10\n",
      "CH_score: 20.231537470394958\n",
      "K-means begin with clusters: 15\n",
      "CH_score: 16.270895973332063\n",
      "K-means begin with clusters: 20\n",
      "CH_score: 13.77888634326657\n",
      "K-means begin with clusters: 30\n",
      "CH_score: 8.659570316428344\n",
      "K-means begin with clusters: 40\n",
      "CH_score: 6.832333995403896\n",
      "K-means begin with clusters: 50\n",
      "CH_score: 6.020025872415452\n"
     ]
    }
   ],
   "source": [
    "# 设置超参数（聚类数目K）搜索范围\n",
    "Ks = [5, 10, 15, 20, 30,40,50]\n",
    "CH_scores = []\n",
    "#si_scores = []\n",
    "for K in Ks:\n",
    "    ch = K_cluster_analysis(K,x)\n",
    "    CH_scores.append(ch)\n",
    "    #si_scores.append(si)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "5\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX4AAAD8CAYAAABw1c+bAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAH7hJREFUeJzt3XmUVNW1x/HvZpBBTABtkdFGYwjEYKPNoEYQBASCAckgJFGecUlUMBpHUBPjHJxNNCqJir5lVJ6IUcEBiQTxabBBUXgYUQPagoCiUaOiwH5/nNuri6Yaiq7qutV1f5+1alXVqVtVu++CfW+de87Z5u6IiEhyNIo7ABERyS8lfhGRhFHiFxFJGCV+EZGEUeIXEUkYJX4RkYRR4hcRSRglfhGRhFHiFxFJmCZxB5DOXnvt5aWlpXGHISLSYCxevPh9dy/JZNuCTPylpaVUVFTEHYaISINhZqsz3VZdPSIiCaPELyKSMEr8IiIJo8QvIpIwSvwiIgmjxC8ikjBK/CIiCVM0if/LL+Gaa+C55+KORESksBVN4t+8GX7/ezjjDNi6Ne5oREQKV9Ek/pYtYepUWLwY7r477mhERApX0SR+gHHj4NBD4YIL4JNP4o5GRKQwFVXiN4ObboL33oMrr4w7GhGRwlRUiR+gd2844QS4/np46624oxERKTxFl/ghnO03aQLnnht3JCIihacoE3/HjjBlCjz0EMyfH3c0IiKFpSgTP8DZZ8O++8KZZ8KWLXFHIyJSOIo28bdoAVdfDUuXwh13xB2NiEjh2GniN7POZvaMma0ws+VmdkbU/lsze9fMXo5uI2p5/zAz+6eZvWFmk3P9B+zIj34ERxwBF10E//53Pr9ZRKRwZXLGvxk42927A/2AiWbWI3rtBncvi25zar7RzBoDtwDDgR7AuJT31jszuPFGeP99uOyyfH2riEhh22nid/e17r4kevwJsALomOHn9wHecPe33P1L4H5gVF2DrYuDD4YTTwzLOaxcmc9vFhEpTLvUx29mpUAv4B9R0yQze8XM7jSzNmne0hF4J+V5JZkfNHLmiiugefNwwVdEJOkyTvxm1gqYCZzp7h8DtwL7A2XAWuC6dG9L0+a1fP4EM6sws4oNGzZkGlZG9tkHLrwQHn0U5s7N6UeLiDQ4GSV+M2tKSPr3uvtDAO6+zt23uPtW4E+Ebp2aKoHOKc87AWvSfYe7T3P3cncvLykp2ZW/ISNnngn77Qe/+lVYyVNEJKkyGdVjwB3ACne/PqW9fcpmxwLL0rz9ReAAM+tqZrsBY4FHsgu5bpo1g2uvheXL4fbb44hARKQwZHLGfzhwPDCoxtDNq83sVTN7BRgI/ArAzDqY2RwAd98MTAKeJFwUnuHuy+vjD8nE6NEwcCD85jewcWNcUYiIxMvc03a5x6q8vNwrKirq5bNfeQV69YJJk8JKniIixcDMFrt7eSbbFu3M3dr07Aknnwy33AIrVsQdjYhI/iUu8UOYzNWqFZx1VtyRiIjkXyITf0lJ6Od/4gmYs918YxGR4pbIxA+hj/+b3wxn/V99FXc0IiL5k9jEv9tucN118M9/hv5+EZGkSGziB/je92DoULjkkrCQm4hIEiQ68ZvBDTfAJ5+EPn8RkSRIdOIH6NEDTj01zOZ99dW4oxERqX+JT/wAv/0tfP3rYR2fApzPJiKSU0r8wJ57hn7+efPgkVhWEhIRyR8l/sgpp0D37mHN/k2b4o5GRKT+KPFHmjYNF3rffDNU6xIRKVZK/CmOPjoM8bzsMli3Lu5oRETqhxJ/DdddB59/DhddFHckIiL1Q4m/hm7d4PTT4Y474KWX4o5GRCT3MqnA1dnMnjGzFWa23MzOiNqvMbPXomLrs8ysdS3vXxUVbHnZzOpnkf0c+/WvoW3bUK5RwztFpNhkcsa/GTjb3bsD/YCJZtYDmAsc6O49gdeBKTv4jIHuXpZpkYC4tWkT+vkXLICZM+OORkQkt3aa+N19rbsviR5/Qiih2NHdn4pKKwK8QCikXjROPhkOPBDOPRe++CLuaEREcmeX+vjNrBToBfyjxks/Bx6v5W0OPGVmi81swq4GGJcmTeDGG2HVqjDMU0SkWGSc+M2sFTATONPdP05pv5DQHXRvLW893N0PBoYTuon61/L5E8yswswqNmzYkPEfUJ+OOgpGjYIrroC1a+OORkQkNzJK/GbWlJD073X3h1LaxwMjgZ96LVXb3X1NdL8emAX0qWW7ae5e7u7lJSUlu/ZX1KNrr4Uvv4QLLog7EhGR3MhkVI8BdwAr3P36lPZhwPnA9939s1reu7uZ7VH1GBgKLMtF4PnyjW+E0T3Tp0NFgxiTJCKyY5mc8R8OHA8MioZkvmxmI4CbgT2AuVHbbQBm1sHMqirZtgMWmtlSYBEw292fyP2fUb8uugj23lvDO0WkODTZ2QbuvhCwNC+lLVMede2MiB6/BRyUTYCF4GtfC/38J58MDzwAY8fGHZGISN1p5m6GTjwRysrgvPPgs7QdWyIiDYMSf4YaN4abboJ33gkXfEVEGiol/l3Qvz/88IcwdSpUVsYdjYhI3Sjx76JrroEtW2Dy5LgjERGpGyX+XVRaGqp03XsvvPBC3NGIiOw6Jf46mDIF2reHM86ArVvjjkZEZNco8ddBq1Zw1VWwaFE48xcRaUiU+Ovo+OOhd+/Q1//pp3FHIyKSOSX+OmrUKKzeuWZNGOUjItJQKPFn4bDDYNy4MK5/9eq4oxERyYwSf5amTgWzMKNXRKQhUOLPUufOIenPmAHPPht3NCIiO6fEnwPnnQedOoXVOzW8U0QKnRJ/DrRsGbp8liwJ6/aLiBQyJf4cGTcODj00VOr6+OOdby8iEhcl/hwxC6t3rlsHV14ZdzQiIrXLpPRiZzN7xsxWmNlyMzsjam9rZnPNbGV036aW94+PtlkZ1egtWr17wwknwA03wOuvxx2NiEh6mZzxbwbOdvfuQD9gopn1ACYD89z9AGBe9HwbZtYWuBjoSyiyfnFtB4hicdVVsPvu8P3vwwcfxB2NiMj2dpr43X2tuy+JHn8CrAA6AqOAu6PN7gZGp3n70cBcd9/o7h8Cc4FhuQi8UHXoAH/9K/zrXzB6NHzxRdwRiYhsa5f6+M2sFOgF/ANo5+5rIRwcgL3TvKUj8E7K88qoLd1nTzCzCjOr2LBhw66EVXCOOALuuQcWLoTx4zXEU0QKS8aJ38xaATOBM90903Er6Yq0e7oN3X2au5e7e3lJSUmmYRWs446Dq68OE7tUtEVECklGid/MmhKS/r3u/lDUvM7M2kevtwfWp3lrJdA55XknYE3dw21YzjkHTjstVO364x/jjkZEJMhkVI8BdwAr3P36lJceAapG6YwH/prm7U8CQ82sTXRRd2jUlghVQzxHjoTTT4dHH407IhGRzM74DweOBwaZ2cvRbQTwO2CIma0EhkTPMbNyM/szgLtvBC4DXoxul0ZtidGkCdx/P/TqBWPHwosvxh2RiCSduaftco9VeXm5V1RUxB1GTr33XpjZ+9lnoVZv165xRyQixcTMFrt7eSbbauZunuyzD8yZA199BcOHw8ZE/e4RkUKixJ9H3bvDww+HMf6jRmmMv4jEQ4k/z/r3h7vvDmP8/+u/NMZfRPKvSdwBJNHYsfD223D++VBaCr/7XdwRiUiSKPHH5NxzQ5fP1Kmw775w6qlxRyQiSaHEHxMz+MMf4J13YNKkUMJx5Mi4oxKRJFAff4xSx/gfdxwU2QhWESlQSvwxa9UKHnsMSkrge98L3T8iIvVJib8A7LMPPP44fPkljBgBH34Yd0QiUsyU+AtE9+5hHf+33grr+G/aFHdEIlKslPgLSP/+MH06LFigMf4iUn80qqfAjBsHq1fDlClhjP9VV8UdkYgUGyX+AnT++bBqVZjYte++cMopcUckIsVEib8AmcHNN4cx/hMnQqdOGuMvIrmjPv4C1aQJPPAAlJWFMf6LF8cdkYgUi0wqcN1pZuvNbFlK2wMpRVlWmdnLtbx3lZm9Gm2n6Um7qFUrmD07jPEfOTJ0/4iIZCuTM/7pwLDUBnc/zt3L3L2MUIv3oXRvjAyMts2oQIBsq2od/y++0Bh/EcmNnSZ+d18ApC0bEtXj/TFwX47jkhQ9eoR1/N98E449VmP8RSQ72fbxHwGsc/eVtbzuwFNmttjMJmT5XYk2YADcdRf8/e9w4oka4y8idZftqJ5x7Phs/3B3X2NmewNzzey16BfEdqIDwwSALl26ZBlWcfrJT8IY/wsuCGP8r7wy7ohEpCGq8xm/mTUBxgAP1LaNu6+J7tcDs4A+O9h2mruXu3t5SUlJXcMqepMnw4QJYWLX7bfHHY2INETZdPUMBl5z98p0L5rZ7ma2R9VjYCiwLN22kjkzuOWWcKH3tNPChV8RkV2RyXDO+4DngW5mVmlmJ0UvjaVGN4+ZdTCzqlTUDlhoZkuBRcBsd38id6EnV+oY/x//GJYsiTsiEWlIzN3jjmE75eXlXqGqJDu1di306xeWc37hhbC8g4gkk5ktznTYvGbuNmDt24d1/D//HIYP1xh/EcmMEn8D16MHzJoFb7wBY8ZojL+I7JwSfxEYODCM8Z8/H37+cyjA3jsRKSBanbNI/PSnYYz/hReGMf5XXBF3RCJSqJT4i8iUKWEhtyuvDMn/5JPjjkhECpESfxExgz/+Mazjf+qpYR3/4cPjjkpECo36+ItMkyYwYwb07Ak/+hH8z//EHZGIFBol/iK0xx5hHf8DDwwTvE46CT79NO6oRKRQKPEXqfbt4dlnw8Xeu+6CQw7RDF8RCZT4i1jTpnD55fDMM/DZZ2GW77XXaklnkaRT4k+AAQNg6VI45hg491wYNiws9yAiyaTEnxBt28KDD8K0abBwYbj4+9hjcUclInFQ4k8QszC2f8mSMNTzmGPg9NPDWj8ikhxK/An0rW+F1TzPOgtuvhn69IFlqpQgkhhK/AnVrBlcdx088QRs2AC9e4fJX1rnR6T4KfEn3NFHwyuvhIXeJk6EUaPg/ffjjkpE6lMmFbjuNLP1ZrYspe23Zvaumb0c3UbU8t5hZvZPM3vDzCbnMnDJnb33DhO+broJnnwyXPh9+um4oxKR+pLJGf90YFia9hvcvSy6bVf51cwaA7cAw4EewDgz65FNsFJ/zOCXv4RFi6B1axgyBM47L1T3EpHistPE7+4LgI11+Ow+wBvu/pa7fwncD4yqw+dIHh10EFRUwCmnwDXXwGGHweuvxx2ViORSNn38k8zslagrqE2a1zsC76Q8r4za0jKzCWZWYWYVGzZsyCIsyVbLlnDrraGy17/+BQcfHJZ90IVfkeJQ18R/K7A/UAasBa5Ls42laas1dbj7NHcvd/fykpKSOoYluTR6dLjw26dPqOw1dix89FHcUYlItuqU+N19nbtvcfetwJ8I3To1VQKdU553AtbU5fskPh07wty5cNVV8NBDoSto4cK4oxKRbNQp8ZtZ+5SnxwLppv+8CBxgZl3NbDdgLPBIXb5P4tW4MUyeDM89FxZ+GzAALr4YNm+OOzIRqYtMhnPeBzwPdDOzSjM7CbjazF41s1eAgcCvom07mNkcAHffDEwCngRWADPcfXk9/R2SB336wEsvwc9+BpdeGg4Aq1bFHZWI7CrzArxiV15e7hUVFXGHITvwl7+E8o4At98e+v9FJD5mttjdyzPZVjN3pU5+8hN4+WX49rdh3Dg48UT45JO4oxKRTCjxS5117QoLFsBvfgP33BOGfb74YtxRicjOKPFLVpo0gUsugfnzYdOmMOFr6lRV+RIpZEr8khNHHBGqfI0eHUYADRkC774bd1Qiko4Sv+RMmzYwYwbccUdY779nT3j44bijEpGalPglp8zCLN8lS6C0FI49Noz+UZUvkcKhxC/1ols3eP55OOccuO026NsXVqyIOyoRASV+qUe77RZW+JwzB9auhfJymD5di72JxE2JX+rd8OHhwm+fPmG8/wknaMy/SJyU+CUvOnQIVb0uuSTM+j3kkDABTETyT4lf8qZx4zDZ629/g//8J/T733KLun5E8k2JX/JuwIBwtj94MEyaBD/4AXz4YdxRiSSHEr/EoqQEHn0Urr023PfqFUYBiUj9U+KX2DRqBGefHQq7mIXZv1dfreUeROqbEr/Erm/fsM7/scfC+efDiBGwfn3cUYkUr0wKsdxpZuvNbFlK2zVm9lpUbH2WmbWu5b2rooItL5uZFtiXWrVuHZZ7uPXWsODbQQeFi8AiknuZnPFPB4bVaJsLHOjuPYHXgSk7eP9Ady/LtECAJJcZnHIKLFoUDgSDB4dRQCrxKJJbO0387r4A2Fij7amotCLAC4RC6iI50bNnWNf/hBPgssvgqKOgsjLuqESKRy76+H8OPF7Law48ZWaLzWxCDr5LEqJVq7C8wz33wOLFUFYGs2fHHZVIccgq8ZvZhcBm4N5aNjnc3Q8GhgMTzaz/Dj5rgplVmFnFhg0bsglLisjxx4fE36kTjBwZRgF9+WXcUYk0bHVO/GY2HhgJ/NRrqdju7mui+/XALKBPbZ/n7tPcvdzdy0tKSuoalhShbt3C+v6nnQbXXw/f/S689VbcUYk0XHVK/GY2DDgf+L67f1bLNrub2R5Vj4GhwLJ024rsTPPmYXmHBx+E118PE75mzIg7KpGGKZPhnPcBzwPdzKzSzE4Cbgb2AOZGQzVvi7btYGZzore2Axaa2VJgETDb3Z+ol79CEuMHPwjLPXTvDscdF0YBqciLyK6xWnppYlVeXu4VFRr2L7X76iu46KIw0/c734EHHggHA5GkMrPFmQ6b18xdaZCaNoWpU+Hxx+G990KRl7vu0kqfIplQ4pcGbdiw0PXTp0+o9Xv88SryIrIzSvzS4KUWebnvPhV5EdkZJX4pCumKvNx8s7p+RNJR4peiklrk5fTTVeRFJB0lfik6VUVerrsu3JeVqciLSColfilKjRrBWWfBc8+FbqAjjgijgFTkRUSJX4pcnz6wZAmMGQOTJ6vIiwgo8UsCtG4dJnjddpuKvIiAEr8khBn84hcq8iICSvySMD17QkUFjB8firwMGqQiL5I8SvySOLvvHpZ3uOee0P9fVgaPPRZ3VCL5o8QviZVa5OWYY1TkRZJDiV8SrarIy8SJKvIiyaHEL4nXvHlY3mHmTBV5kWTIKPGb2Z1mtt7MlqW0tTWzuWa2MrpvU8t7x0fbrIzKNYoUpDFjwnIPPXqEIi+/+IWKvEhxyvSMfzowrEbbZGCeux8AzIueb8PM2gIXA30J9XYvru0AIVIISkthwQI47zyYNi1MAPu//4s7KpHcyijxu/sCYGON5lHA3dHju4HRad56NDDX3Te6+4fAXLY/gIgUlNQiL+vWQe/eKvIixSWbPv527r4WILrfO802HYF3Up5XRm0iBa+qyEvfviryIsWlvi/uWpq2tOdNZjbBzCrMrGLDhg31HJZIZjp0gLlzty3y8tJLcUclkp1sEv86M2sPEN2nW/qqEuic8rwTsCbdh7n7NHcvd/fykpKSLMISya2aRV769YM//EFdP9JwZZP4HwGqRumMB/6aZpsngaFm1ia6qDs0ahNpcAYMgKVLYcgQ+OUvwyggFXmRhijT4Zz3Ac8D3cys0sxOAn4HDDGzlcCQ6DlmVm5mfwZw943AZcCL0e3SqE2kQdprL3jkkVDkZfZsFXmRhsm8AH+vlpeXe0VFRdxhiOzQokUwdiy8/TZcfnkYAtpIUyIlJma22N3LM9lW/0xF6qhPn3Chd8wYmDIFhg8Pwz9FCp0Sv0gWvv716iIvCxaErp958+KOSmTHlPhFslSzyMuQIfDrX6vIixQuJX6RHPnOd6qLvFx+uYq8SOFS4hfJoaoiL//93yryIoVLiV+kHvzsZyHxd+4cirycdZaKvEjhUOIXqSff/GYY4z9pEtxwAxx+OLz5ZtxRiSjxi9Sr5s3D8g4zZ8Ibb4QiL5dfHi4Eb9kSd3SSVEr8InkwZkwY819eHkb89O0bZgEfeyzccgu89prW/pH8aRJ3ACJJUVoaFnpbvz7cP/10uD38cHi9Y0c46igYPDjcd+gQa7hSxLRkg0iM3ENx93nzwkHgb3+DDz4Ir3XvXn0QOPLIMFlMpDa7smSDEr9IAdm6NawAWnUgWLAg1P1t1ChUAqs6EBx2GDRrFne0UkiU+EWKxKZN8MIL4SAwb171ReEWLeC7360+EJSVhboBklxK/CJF6uOP4e9/rz4QLF8e2tu2DTOFq64R7L9/WEpCkkOJXyQh1q6tvlA8bx68E1W47tKl+tfAUUdBu3bxxin1T4lfJIHcYeXK6usDzzxTXSHswAOrDwQDBsAee8Qbq+ReXhK/mXUDHkhp2g/4jbvfmLLNkYSSjP+Kmh5y90t39tlK/CLZ27IlzB2o+jWwcCF88QU0aRJqCVQdCPr1g912iztayVbez/jNrDHwLtDX3VentB8JnOPuI3fl85T4RXLviy/gf/+3+kBQURFGEbVsCf37hwPB4MFhlVFVEmt4diXx52oC11HAm6lJX0QKS/Pm4QLwoEHh+Ucfwfz51QeCc84J7XvtVX1tYPBg6No1tpClnuTqjP9OYIm731yj/UhgJlAJrCGc/S+v5TMmABMAunTpcsjq1TqGiORTZeW2M4rXrg3tXbtWdwsNGgQlJfHGKenltavHzHYjJPVvu/u6Gq99Ddjq7p+a2QjgJnc/YGefqa4ekXi5h/WDqn4NPPNMGEoKcNBB1QeC/v1DDQKJX74T/yhgorsPzWDbVUC5u7+/o+2U+EUKy+bNsHhx9YHguedCfYGmTeHQQ6u7hXr3Dm2Sf/lO/PcDT7r7XWle2wdY5+5uZn2AB4F9fSdfqsQvUtg++ywk/6oDwZIl4VdCq1ZhXaGqA8G3v62JZPmSt4u7ZtYSGAL8IqXtFAB3vw34IXCqmW0GPgfG7izpi0jha9kyFJUfMiQ8/+CD6gvFTz9dXW6yXbttLxR36RJbyJJCE7hEJOdWrw6/BKomk61fH9q/8Y3q6wMDB8Kee8YbZzHRzF0RKRjuYU2hqm6h+fPh009DF9BBB0G3buGXQM1bmzbqJtoVSvwiUrC++gpefDEcCJ59FlatCmsMbdq07Xa7757+gFB169RJM45TxTGBS0QkI02bhnoChx1W3bZ1K2zYAG+/nf720kvV3UVVzGCffXZ8cNhzT/1qSEeJX0Ri16hRuBDcrl0YEprO55+HSWbpDgxLl8Kjj4ZlKVK1aLHzXw3Nm9f/31dolPhFpEFo0QIOOCDc0nGH99+v/VfD7Nnw3nvbv69dux0fHEpKiu9XgxK/iBQFs5CkS0rgkEPSb7NpU+2/GpYvhzlzwi+LVM2bQ+fO2x4M9t13218NLVrU/9+XS0r8IpIYzZqF6mT775/+dXfYuLH6YLB69bYHhyeeqF7DKNXee+/8V0MhrXiqxC8iEjELF4T33BN69Uq/zaZN8O676X81rFgBTz4J//nPtu9p1mz7Xw2pt86dw6S4fFHiFxHZBc2awX77hVs67qHyWW3XGubOhTVrwnap9toLvvWtMMS1vinxi4jkkBm0bRtuZWXpt/nqq/S/GrZsyU+MSvwiInnWtCmUloZbHArocoOIiOSDEr+ISMIo8YuIJIwSv4hIwmSd+M1slZm9amYvm9l2S2pa8Hsze8PMXjGzg7P9ThERqbtcjeoZuIM6usOBA6JbX+DW6F5ERGKQj66eUcA9HrwAtDaz9nn4XhERSSMXid+Bp8xssZlNSPN6R+CdlOeVUZuIiMQgF109h7v7GjPbG5hrZq+5+4KU19MtaLpd2a/ooFF14PjUzP6Zg9jitBdQW/dX0mhfbEv7Y1vaH9Wy2Rf7Zrph1onf3ddE9+vNbBbQB0hN/JVA55TnnYA1aT5nGjAt23gKhZlVZFoGrdhpX2xL+2Nb2h/V8rUvsurqMbPdzWyPqsfAUGBZjc0eAU6IRvf0A/7t7mkWNhURkXzI9oy/HTDLQnmaJsBf3P0JMzsFwN1vA+YAI4A3gM+AE7P8ThERyUJWid/d3wIOStN+W8pjByZm8z0NVNF0W+WA9sW2tD+2pf1RLS/7wrzmotAiIlLUtGSDiEjCKPFnyczuNLP1ZrYspa2tmc01s5XRfZs4Y8wnM+tsZs+Y2QozW25mZ0TtidsnZtbczBaZ2dJoX1wStXc1s39E++IBM9st7ljzycwam9lLZvZY9Dyx+yPdkjf5+L+ixJ+96cCwGm2TgXnufgAwL3qeFJuBs929O9APmGhmPUjmPtkEDHL3g4AyYFg0sm0qcEO0Lz4ETooxxjicAaxIeZ70/THQ3ctShnHW+/8VJf4sRZPVNtZoHgXcHT2+Gxid16Bi5O5r3X1J9PgTwn/wjiRwn0TLlHwaPW0a3RwYBDwYtSdiX1Qxs07A94A/R8+NBO+PWtT7/xUl/vrRrmquQnS/d8zxxMLMSoFewD9I6D6JujVeBtYDc4E3gY/cfXO0SdKWMLkROA/YGj3fk2Tvj3RL3tT7/xXV3JV6YWatgJnAme7+cTTXI3HcfQtQZmatgVlA93Sb5TeqeJjZSGC9uy82syOrmtNsmoj9EdluyZt8fKnO+OvHuqoVSKP79THHk1dm1pSQ9O9194ei5kTvE3f/CJhPuO7R2syqTrrSLmFSpA4Hvm9mq4D7CV08N5Lc/bHNkjeEE4M+5OH/ihJ//XgEGB89Hg/8NcZY8irqs70DWOHu16e8lLh9YmYl0Zk+ZtYCGEy45vEM8MNos0TsCwB3n+Lundy9FBgL/M3df0pC98cOlryp9/8rmsCVJTO7DziSsKreOuBi4GFgBtAFeBv4kbvXvABclMzsu8CzwKtU9+NeQOjnT9Q+MbOehItzjQknWTPc/VIz249wxtsWeAn4mbtvii/S/Iu6es5x95FJ3R/R3z0relq15M0VZrYn9fx/RYlfRCRh1NUjIpIwSvwiIgmjxC8ikjBK/CIiCaPELyKSMEr8IiIJo8QvIpIwSvwiIgnz/+LivKFW0ZnYAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0xe7f7a58>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# 绘制不同K对应的聚类的性能，找到最佳模型／参数（分数最高）\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "\n",
    "plt.plot(Ks, np.array(CH_scores), 'b-',label = 'CH_scores')\n",
    "\n",
    "\n",
    "### 最佳超参数\n",
    "index = np.unravel_index(np.argmax(CH_scores, axis=None), len(CH_scores))\n",
    "Best_K = Ks[ index[0]]\n",
    "\n",
    "print(Best_K)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 用最佳的K再次聚类，得到聚类结果"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {},
   "outputs": [],
   "source": [
    "mb_kmeans = MiniBatchKMeans(n_clusters = Best_K)\n",
    "\n",
    "y_pred = mb_kmeans.fit_predict(x)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([4, 3, 2, ..., 4, 0, 0])"
      ]
     },
     "execution_count": 103,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y_pred"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 保存聚类的结果"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {},
   "outputs": [],
   "source": [
    "#保存聚类结果\n",
    "feat_names_Kmeans = \"Kmeans_\" + str(Best_K)\n",
    "\n",
    "y = pd.Series(data = y, name = 'type')\n",
    "train_kmeans = pd.concat([pd.Series(name = feat_names_Kmeans, data = y_pred), y], axis = 1)\n",
    "train_kmeans.to_csv( 'company_classification_train_KMeans.csv',index=False,header=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 保存KMeans模型，用于后续对测试数据的聚类"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {},
   "outputs": [],
   "source": [
    "import _pickle as cPickle\n",
    "\n",
    "cPickle.dump(mb_kmeans, open(\"mb_kmeans.pkl\", 'wb'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
